import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
import seaborn as sns
from patsy import dmatrices
from sklearn.model_selection import train_test_split
import statsmodels.discrete.discrete_model as sm
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#
CRisk = pd.read_csv(r"C:/Users/anten/OneDrive/Desktop/CIND 820 Project/Credit Risk Dataset/credit_risk_dataset.csv")
CRisk.head()
| person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade | loan_amnt | loan_int_rate | loan_status | loan_percent_income | cb_person_cred_hist_length | cb_person_default_on_file | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22 | 59000 | RENT | 123.0 | PERSONAL | D | 35000 | 16.02 | 1 | 0.59 | 3 | Y |
| 1 | 21 | 9600 | OWN | 5.0 | EDUCATION | B | 1000 | 11.14 | 0 | 0.10 | 2 | N |
| 2 | 25 | 9600 | MORTGAGE | 1.0 | MEDICAL | C | 5500 | 12.87 | 1 | 0.57 | 3 | N |
| 3 | 23 | 65500 | RENT | 4.0 | MEDICAL | C | 35000 | 15.23 | 1 | 0.53 | 2 | N |
| 4 | 24 | 54400 | RENT | 8.0 | MEDICAL | C | 35000 | 14.27 | 1 | 0.55 | 4 | Y |
# Exploratory Data Analysis
Profile = ProfileReport(CRisk, title = 'Credit Risk dataset profiling Report', html ={'style':{'full_width': True}})
Profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Checking outlier among attributes
# define a function called "plot_boxplot"
def plot_boxplot(df, ft):
df.boxplot(column=[ft])
plt.grid(False)
plt.show()
plot_boxplot(CRisk, "person_age")
plot_boxplot(CRisk, "person_income")
plot_boxplot(CRisk, "person_emp_length")
plot_boxplot(CRisk, "loan_int_rate")
plot_boxplot(CRisk, "loan_percent_income")
plot_boxplot(CRisk, "cb_person_cred_hist_length")
plot_boxplot(CRisk, "loan_amnt")
# define a function called "outlers" which returns a list of index of outliers
def outliers(df, ft):
Q1 = df[ft].quantile(0.25)
Q3 = df[ft].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
ls = df.index[(df[ft] < lower_bound) | (df[ft] > upper_bound)]
return ls
index_list = []
for attribute in ["person_age", "person_income","person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income",
"cb_person_cred_hist_length"]:
index_list.extend(outliers(CRisk, attribute))
# define a function called 'remove' which returns a cleaned dataframe without outliers
def remove(df, ls):
ls = sorted(set(ls))
df = df.drop(ls)
return df
CRisk = remove(CRisk, index_list)
CRisk.shape
(27024, 12)
# Training set used to train moel
# Validation set used to evaluate the model during training, tune model hyperparameters
# test set usded to compare different modlels o and to report the model accuracy
train_val_df, test_df = train_test_split(CRisk, test_size = 0.2 )
train_df, val_df = train_test_split(train_val_df, test_size = 0.25)
print('train_df.shape:' , train_df.shape)
print('val_df.shape:' , val_df.shape)
print('test_df.shape:' , test_df.shape)
train_df.shape: (16214, 12) val_df.shape: (5405, 12) test_df.shape: (5405, 12)
# Selecting only the important columns (inputs for modeling)
input_cols = list(train_df[['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent',
'loan_grade', 'loan_amnt', 'loan_int_rate','loan_percent_income',
'cb_person_cred_hist_length', 'cb_person_default_on_file' ]])
print(input_cols)
['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'cb_person_default_on_file']
# target columns
target_col = 'loan_status'
print(target_col)
loan_status
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
val_inputs =val_df[input_cols].copy()
val_targets = val_df[target_col].copy()
test_inputs = test_df[input_cols].copy()
test_targets = val_df[target_col].copy()
train_inputs = train_df[input_cols]
train_targets = train_df[target_col]
val_inputs =val_df[input_cols]
val_targets = val_df[target_col]
test_inputs = test_df[input_cols]
test_targets = val_df[target_col]
numeric_cols = train_inputs.select_dtypes(include = np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()
numeric_cols
['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
categorical_cols
['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
train_inputs[categorical_cols].nunique()
person_home_ownership 4 loan_intent 6 loan_grade 7 cb_person_default_on_file 2 dtype: int64
# checking missing value in train set
train_inputs[numeric_cols].isna().sum()
person_age 0 person_income 0 person_emp_length 495 loan_amnt 0 loan_int_rate 1532 loan_percent_income 0 cb_person_cred_hist_length 0 dtype: int64
# checking missing value in test set
test_inputs [numeric_cols].isna().sum()
person_age 0 person_income 0 person_emp_length 141 loan_amnt 0 loan_int_rate 495 loan_percent_income 0 cb_person_cred_hist_length 0 dtype: int64
# Missing values handeling using Sklearn impute from from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean')
imputer.fit(CRisk[numeric_cols])
SimpleImputer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SimpleImputer()
list(imputer.statistics_)
[26.38306690349319, 57535.372372705744, 4.353270814272644, 8418.799030491415, 10.919090240013084, 0.16262877442273535, 5.0016281823564235]
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = imputer.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = imputer.transform(test_inputs[numeric_cols])
train_inputs[numeric_cols].isna().sum()
person_age 0 person_income 0 person_emp_length 0 loan_amnt 0 loan_int_rate 0 loan_percent_income 0 cb_person_cred_hist_length 0 dtype: int64
# Feature Scaling using from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(CRisk[numeric_cols])
MinMaxScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MinMaxScaler()
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])
train_inputs[numeric_cols].describe()
| person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_percent_income | cb_person_cred_hist_length | |
|---|---|---|---|---|---|---|---|
| count | 16214.000000 | 16214.000000 | 16214.000000 | 16214.000000 | 16214.000000 | 16214.000000 | 16214.000000 |
| mean | 0.318573 | 0.395210 | 0.311921 | 0.351804 | 0.335310 | 0.353428 | 0.231813 |
| std | 0.210992 | 0.196427 | 0.238465 | 0.216205 | 0.187395 | 0.216335 | 0.229107 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.150000 | 0.249551 | 0.142857 | 0.195556 | 0.151961 | 0.186047 | 0.076923 |
| 50% | 0.250000 | 0.361691 | 0.285714 | 0.311111 | 0.336954 | 0.302326 | 0.153846 |
| 75% | 0.450000 | 0.508519 | 0.428571 | 0.501667 | 0.468137 | 0.488372 | 0.384615 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(CRisk[numeric_cols])
array([[-1.27613037, -1.79695308, 0.19141909, ..., 0.06897353,
-0.6722018 , -1.00988286],
[-1.27613037, -1.78570698, -0.69652179, ..., -1.17992614,
0.93776536, -1.00988286],
[-1.27613037, -1.78195828, 0.48739939, ..., 1.19298324,
-0.02821494, -0.67343784],
...,
[ 3.2280821 , -0.91975739, -0.69652179, ..., nan,
-1.42351981, 3.36390242],
[ 2.51689066, 1.21700134, 1.96730086, ..., -1.3422831 ,
-1.31618867, 3.36390242],
[ 2.75395447, 1.96674125, 0.19141909, ..., 0.33436471,
-0.13554608, 2.69101237]])
# Encoding Categorical Variable using OneHotEncoder
CRisk[categorical_cols].nunique()
person_home_ownership 4 loan_intent 6 loan_grade 7 cb_person_default_on_file 2 dtype: int64
CRisk["person_home_ownership"].value_counts()
RENT 14221 MORTGAGE 10596 OWN 2123 OTHER 84 Name: person_home_ownership, dtype: int64
CRisk["loan_intent"].value_counts()
EDUCATION 5537 MEDICAL 5061 VENTURE 4763 PERSONAL 4490 DEBTCONSOLIDATION 4314 HOMEIMPROVEMENT 2859 Name: loan_intent, dtype: int64
CRisk["loan_grade"].value_counts()
A 9219 B 8550 C 5510 D 2835 E 729 F 150 G 31 Name: loan_grade, dtype: int64
CRisk["cb_person_default_on_file"].value_counts()
N 22343 Y 4681 Name: cb_person_default_on_file, dtype: int64
encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
encoder.fit(CRisk[categorical_cols])
OneHotEncoder(handle_unknown='ignore', sparse=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
OneHotEncoder(handle_unknown='ignore', sparse=False)
encoder.categories_
[array(['MORTGAGE', 'OTHER', 'OWN', 'RENT'], dtype=object),
array(['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL',
'PERSONAL', 'VENTURE'], dtype=object),
array(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype=object),
array(['N', 'Y'], dtype=object)]
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
print(encoded_cols)
['person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N', 'cb_person_default_on_file_Y']
train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
val_inputs[encoded_cols] = encoder.transform(val_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])
## Train_inputs = X_train, train_targets = Y_train and test_inputs = X_test , and test_target = Y_test )
print('train_inputs:', train_inputs.shape)
print('train_targets:', train_targets.shape)
print('val_inputs:', val_inputs.shape)
print('val_targets:', val_targets.shape)
print('test_inputs:', test_inputs.shape)
print('test_targets:', test_targets.shape)
train_inputs: (16214, 30) train_targets: (16214,) val_inputs: (5405, 30) val_targets: (5405,) test_inputs: (5405, 30) test_targets: (5405,)
# Logistic regression is a commonly used techniques for solving binary classification problems
model = LogisticRegression(solver = 'liblinear')
model.fit(train_inputs[numeric_cols + encoded_cols], train_targets)
LogisticRegression(solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(solver='liblinear')
print(numeric_cols + encoded_cols)
['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER', 'person_home_ownership_OWN', 'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_N', 'cb_person_default_on_file_Y']
# Postive and Negative predicters of deafult
print(model.coef_.tolist())
[[-0.3966632588158861, -0.349461744682226, -0.22479355317937583, -3.178825897398039, 1.0716674803028918, 6.043505575968809, 0.13774254214615003, -0.0018494442477760297, 0.1144859084628128, -1.7104156664478034, 0.6135744923117117, 0.19794626735315135, -0.5870843514028505, 0.4876678251023244, 0.011942042170631528, -0.2907054486905857, -0.8039710444555295, -1.9895767731684757, -1.7477572926961862, -1.5671262467873217, 0.4479693550089058, 0.7561135922390441, 0.8495202208734832, 2.266652434607905, -0.5309815984648207, -0.4532231114575803]]
print(model.intercept_)
[-0.98420471]
print(model.feature_names_in_)
['person_age' 'person_income' 'person_emp_length' 'loan_amnt' 'loan_int_rate' 'loan_percent_income' 'cb_person_cred_hist_length' 'person_home_ownership_MORTGAGE' 'person_home_ownership_OTHER' 'person_home_ownership_OWN' 'person_home_ownership_RENT' 'loan_intent_DEBTCONSOLIDATION' 'loan_intent_EDUCATION' 'loan_intent_HOMEIMPROVEMENT' 'loan_intent_MEDICAL' 'loan_intent_PERSONAL' 'loan_intent_VENTURE' 'loan_grade_A' 'loan_grade_B' 'loan_grade_C' 'loan_grade_D' 'loan_grade_E' 'loan_grade_F' 'loan_grade_G' 'cb_person_default_on_file_N' 'cb_person_default_on_file_Y']
# # Attributes are positively correlated and negatively correlated :
# the higher the weight of the predictor and more important, the lower the less important
n = len(model.coef_.tolist())
pd.DataFrame({
'Attributes':(numeric_cols + encoded_cols),
'Predicters':model.coef_.tolist()[0]
})
| Attributes | Predicters | |
|---|---|---|
| 0 | person_age | -0.396663 |
| 1 | person_income | -0.349462 |
| 2 | person_emp_length | -0.224794 |
| 3 | loan_amnt | -3.178826 |
| 4 | loan_int_rate | 1.071667 |
| 5 | loan_percent_income | 6.043506 |
| 6 | cb_person_cred_hist_length | 0.137743 |
| 7 | person_home_ownership_MORTGAGE | -0.001849 |
| 8 | person_home_ownership_OTHER | 0.114486 |
| 9 | person_home_ownership_OWN | -1.710416 |
| 10 | person_home_ownership_RENT | 0.613574 |
| 11 | loan_intent_DEBTCONSOLIDATION | 0.197946 |
| 12 | loan_intent_EDUCATION | -0.587084 |
| 13 | loan_intent_HOMEIMPROVEMENT | 0.487668 |
| 14 | loan_intent_MEDICAL | 0.011942 |
| 15 | loan_intent_PERSONAL | -0.290705 |
| 16 | loan_intent_VENTURE | -0.803971 |
| 17 | loan_grade_A | -1.989577 |
| 18 | loan_grade_B | -1.747757 |
| 19 | loan_grade_C | -1.567126 |
| 20 | loan_grade_D | 0.447969 |
| 21 | loan_grade_E | 0.756114 |
| 22 | loan_grade_F | 0.849520 |
| 23 | loan_grade_G | 2.266652 |
| 24 | cb_person_default_on_file_N | -0.530982 |
| 25 | cb_person_default_on_file_Y | -0.453223 |
# Visualizing the logsitic regression coef
n = len(model.coef_.tolist())
weight_df = pd.DataFrame({
'feature':(numeric_cols + encoded_cols),
'weight':model.coef_.tolist()[0]
})
sns.barplot(data = weight_df.sort_values('weight', ascending = False).head(25), x = 'weight', y = 'feature')
<AxesSubplot:xlabel='weight', ylabel='feature'>
# Evaluating the model
X_train = train_inputs[numeric_cols + encoded_cols]
X_val = val_inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols]
train_preds = model.predict(X_train)
train_preds
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# The model achives an accuracy of 86% of training set
accuracy_score(train_targets, train_preds)
0.8648081904526952
confusion_matrix(train_targets, train_preds)
array([[12206, 596],
[ 1596, 1816]], dtype=int64)
# Visualization
def predict_and_plot(inputs, targets, name =''):
preds = model.predict(inputs)
accuracy = accuracy_score(targets, preds)
print("Accuracy: {:.2f}%".format(accuracy * 100))
cf = confusion_matrix(targets, preds, normalize = 'true')
plt.figure()
sns.heatmap(cf, annot = True)
plt.xlabel('Prediction')
plt.ylabel('Target')
plt.title('{} Confusion Matrix'.format(name));
return preds
train_preds = predict_and_plot(X_train, train_targets, 'Training')
Accuracy: 86.48%
test_preds = predict_and_plot(X_test, test_targets, 'Test')
Accuracy: 71.29%
print(classification_report(test_targets, test_preds))
precision recall f1-score support
0 0.80 0.85 0.83 4297
1 0.23 0.17 0.19 1108
accuracy 0.71 5405
macro avg 0.51 0.51 0.51 5405
weighted avg 0.68 0.71 0.70 5405
from sklearn import tree
dt_clf = tree.DecisionTreeClassifier(max_depth = 5)
dt_clf.fit(X_train, train_targets)
dt_clf.score(X_test, test_targets)
0.7250693802035153
train_preds = dt_clf.predict(X_test)
dt_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
array([[3761, 536],
[ 950, 158]], dtype=int64)
print(classification_report(test_targets, train_preds))
precision recall f1-score support
0 0.80 0.88 0.84 4297
1 0.23 0.14 0.18 1108
accuracy 0.73 5405
macro avg 0.51 0.51 0.51 5405
weighted avg 0.68 0.73 0.70 5405
from sklearn import ensemble
rf_clf = ensemble.RandomForestClassifier(n_estimators = 100)
rf_clf.fit(X_train, train_targets)
rf_clf.score(X_test, test_targets)
0.7110083256244218
train_preds = rf_clf.predict(X_test)
rf_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
array([[3671, 626],
[ 936, 172]], dtype=int64)
print(classification_report(test_targets, train_preds))
precision recall f1-score support
0 0.80 0.85 0.82 4297
1 0.22 0.16 0.18 1108
accuracy 0.71 5405
macro avg 0.51 0.50 0.50 5405
weighted avg 0.68 0.71 0.69 5405
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(X_train, train_targets)
nb_clf.score(X_test, test_targets)
0.7150786308973173
train_preds = nb_clf.predict(X_test)
nb_clf.score(X_test, test_targets)
confusion_matrix(test_targets, train_preds)
array([[3685, 612],
[ 928, 180]], dtype=int64)
print(classification_report(test_targets, train_preds))
precision recall f1-score support
0 0.80 0.86 0.83 4297
1 0.23 0.16 0.19 1108
accuracy 0.72 5405
macro avg 0.51 0.51 0.51 5405
weighted avg 0.68 0.72 0.70 5405